Tutorial based on a Real Python post
In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the three labelled-sentence datasets
filepath_dict = {'yelp': 'sentiment-labelled-sentences/yelp_labelled.txt',
                 'amazon': 'sentiment-labelled-sentences/amazon_cells_labelled.txt',
                 'imdb': 'sentiment-labelled-sentences/imdb_labelled.txt'}

# Concatenate the datasets into a single dataframe
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add a column with the source name
    df_list.append(df)
df = pd.concat(df_list)

# Select the Yelp dataset
df_yelp = df[df['source'] == 'yelp']

# Get a NumPy array with the sentence values to build the training X data
sentences = df_yelp['sentence'].values

# The y variable holds the classification labels (1 = positive, 0 = negative)
y = df_yelp['label'].values

# Split the dataset into 75% for training and 25% for testing
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,
                                                                    y,
                                                                    test_size=0.25,
                                                                    random_state=1000)
In [2]:
'''
The CountVectorizer takes the words of each sentence and creates a vocabulary of all the unique words in the sentences.
This vocabulary can then be used to create a feature vector of word counts.
'''
'''
Here we use the bag-of-words (BOW) model described above to vectorize the sentences.
You can use CountVectorizer for this task. Since you might not have
the testing data available during training, you create the vocabulary
using only the training data. Using this vocabulary, you can create the
feature vectors for each sentence of the training and testing set.
'''
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
In [3]:
# Here we can see one of the raw training sentences
sentences_train[1]
Out[3]:
In [4]:
# Here is the same sentence converted into a sparse vector of token counts
print(X_train[1])
In [5]:
# If we view this row as a Pandas dataframe, we get an array with 1713 positions,
# each holding the count of the corresponding vocabulary word in the sentence
# (mostly 0s and 1s for short sentences).
# These 1713 positions correspond to the size of the vocabulary built from the training data.
# According to the scikit-learn documentation, this is "a sparse representation of the
# counts using scipy.sparse.csr_matrix".
pd.DataFrame(X_train[1].toarray())
Out[5]:
In [6]:
# Let's look at all the positions of our array.
# The transpose is only for visualization purposes; at the end
# we keep only the rows whose value equals 1.
sentence_df = pd.DataFrame(X_train[1].toarray()).T
sentence_df[sentence_df[0] == 1]
# As we can see, the sentence uses only the words at the following positions of the vocabulary.
Out[6]:
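To make the indices above readable, here is a small optional sketch (not part of the original tutorial) that inverts vectorizer.vocabulary_ and recovers the actual words behind those column positions:

# Optional sketch: map the non-zero column indices of X_train[1] back to words.
# Assumes `vectorizer` and `X_train` from the cells above are still in scope.
index_to_word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
row = X_train[1].toarray().ravel()
print([index_to_word[i] for i in row.nonzero()[0]])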
In [7]:
# The test labels hold only the class value (1 or 0)
print(y_test[1])
In [8]:
# And this is our vocabulary: a map from each word to its column index
vectorizer.vocabulary_
Out[8]:
In [9]:
# Now we'll test all data sources using logistic regression as a baseline.
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,
                                                                        y,
                                                                        test_size=0.25,
                                                                        random_state=1000)

    # Build the vocabulary and the sparse count matrices
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))
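As a quick sanity check (a sketch, not part of the original post), you can score a brand-new sentence with the classifier and vectorizer left over from the last loop iteration (the imdb source, since it is processed last):

# Optional sketch: classify an unseen sentence with the last-fitted vectorizer/classifier pair.
sample = ['A wonderful and moving film']
print(classifier.predict(vectorizer.transform(sample)))  # 1 = positive, 0 = negative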
In [10]:
# A fully connected dense network built with the Sequential API.
# The hidden layer has 10 neurons and uses the ReLU activation function;
# the output layer uses a sigmoid activation to produce a probability.
from keras.models import Sequential
from keras import layers

# Number of input features for the model: the size of the CountVectorizer
# vocabulary (2505 features in this run).
input_dim = X_train.shape[1]

# Use the Sequential API to stack the layers
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Since this is a binary classification problem, we use the
# binary_crossentropy loss with the Adam optimizer to speed up
# convergence, and accuracy as the monitored metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()

# We train for 50 epochs with a small batch size of 10, meaning the weights
# are updated after every mini-batch of 10 records within each epoch.
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
# Model Evaluation: Training
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# Model Evaluation: Test
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
In [11]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    # Depending on the Keras version, the history keys are 'acc'/'val_acc'
    # or 'accuracy'/'val_accuracy', so we look up both.
    acc = history.history.get('acc', history.history.get('accuracy'))
    val_acc = history.history.get('val_acc', history.history.get('val_accuracy'))
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
In [12]:
plot_history(history)
In [13]:
'''
Now you need to tokenize the data into a format that can be used by the word embeddings.
Keras offers a couple of convenience methods for text preprocessing and sequence
preprocessing which you can employ to prepare your text.
You can start by using the Tokenizer utility class which can vectorize a
text corpus into a list of integers. Each integer maps to a value in a dictionary
that encodes the entire corpus, with the keys in the dictionary being the
vocabulary terms themselves. You can add the parameter num_words, which is
responsible for setting the size of the vocabulary. The most common
num_words words will then be kept. We have the testing and training
data prepared from the previous example:
'''
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
vocab_size = len(tokenizer.word_index) + 1 # Adding 1 because of reserved 0 index
print(sentences_train[2])
print(X_train[2])
'''
With CountVectorizer, we had stacked vectors of word counts,
and each vector was the same length (the size of the total
corpus vocabulary). With Tokenizer, the resulting vectors
equal the length of each text, and the numbers don’t denote
counts; instead, each number is the word’s index in the
dictionary tokenizer.word_index.
'''
In [14]:
for word in ['the', 'all', 'good', 'nice']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))
In [15]:
# Here we can see our dictionary, where the words are the keys and
# the integer indices are assigned by word frequency (most frequent first)
tokenizer.word_index
Out[15]:
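The Tokenizer can also invert the encoding, which is a handy way to verify what the integers stand for (a small optional check, assuming the un-padded X_train sequences from the previous cell are still in scope; the recovered text is lowercased and stripped of punctuation):

# Optional check: turn an integer sequence back into text.
print(tokenizer.sequences_to_texts([X_train[2]]))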
In [16]:
'''
One problem is that, in most cases, the text sequences have different
lengths. To counter this, you can use pad_sequences(), which simply
pads the sequence of words with zeros. By default it prepends zeros, but
here we want to append them; typically it does not matter whether you prepend or append zeros.
Additionally, you can add a maxlen parameter to specify how
long the sequences should be; sequences that exceed that length are truncated.
In the following code, you can see how to pad sequences with Keras:
'''
from keras.preprocessing.sequence import pad_sequences
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
print(X_train[0, :])
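A quick optional check (not in the original post) confirms that padding produced a fixed-size matrix, which is what the Embedding layer used below expects:

# Optional check: every padded sequence now has exactly maxlen entries.
print(X_train.shape)  # (number of training sentences, 100)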
In [17]:
'''
Now you can use the Embedding layer of Keras, which takes the
previously calculated integers and maps each of them to a dense embedding
vector. You will need the following parameters:
- input_dim: the size of the vocabulary
- output_dim: the size of the dense vector
- input_length: the length of the sequence
With the Embedding layer we now have a couple of options. One way
would be to take the output of the embedding layer and plug it into
a Dense layer. In order to do this you have to add a Flatten layer
in between that prepares the sequential input for the Dense layer:
'''
from keras.models import Sequential
from keras import layers
embedding_dim = 50
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
'''
You can now see that we have 87350 new parameters to train.
This number comes from vocab_size times the embedding_dim.
The weights of the embedding layer are initialized randomly and are
then adjusted through backpropagation during training.
This model takes the words as they come in the order of the
sentences as input vectors. You can train it with the following:
'''
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
In [18]:
'''
Another way to work with embeddings is by using a MaxPooling1D/AveragePooling1D
or a GlobalMaxPooling1D/GlobalAveragePooling1D layer after the
embedding. You can think of the pooling layers as a way to
downsample (a way to reduce the size of) the incoming feature vectors.
In the case of max pooling you take the maximum value of all
features in the pool for each feature dimension. In the case
of average pooling you take the average, but max pooling seems
to be more commonly used as it highlights large values.
Global max/average pooling takes the maximum/average of all
features whereas in the other case you have to define the
pool size. Keras again has its own layer that you can add to the sequential model:
'''
from keras.models import Sequential
from keras import layers
embedding_dim = 50
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
In [19]:
'''
Word embeddings do not understand the text as a human would; rather, they capture
the statistical structure of the language used in the corpus. Their aim is to map
semantic meaning into a geometric space. This geometric space is then
called the embedding space.
'''
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding 1 again because of the reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Each line of a GloVe file holds a word followed by its vector components
    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
In [20]:
# Retrieve the embedding matrix from the pretrained GloVe vectors
embedding_dim = 50
embedding_matrix = create_embedding_matrix('/Users/flavioclesio/Desktop/programming-study/machine-learning/nlp/glove.6B/glove.6B.50d.txt',
                                           tokenizer.word_index,
                                           embedding_dim)
In [21]:
# Fraction of our vocabulary covered by the pretrained embeddings:
# count the rows of the embedding matrix that are not entirely zero.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size
Out[21]:
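The value above is the share of our vocabulary that has a pretrained vector. If you want to see which words were not covered (their rows in embedding_matrix stayed all zeros), here is a small optional sketch, assuming tokenizer and embedding_matrix from the cells above:

# Optional sketch: list a few vocabulary words with no pretrained GloVe vector.
missing = [word for word, idx in tokenizer.word_index.items()
           if not embedding_matrix[idx].any()]
print(len(missing), missing[:10])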
In [22]:
'''
Let’s have a look at the performance when using the pretrained, frozen
embedding matrix (trainable=False) together with the GlobalMaxPool1D layer:
'''
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
In [24]:
'''
Since the word embeddings are not additionally trained, the accuracy is expected to be lower.
But let’s now see how this performs if we allow the embeddings to be
trained by setting trainable=True:
'''
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
                    epochs=250,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
In [26]:
# Convolutional neural network: a Conv1D layer over the embedded word sequence
embedding_dim = 100
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
In [27]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
In [28]:
# Initial parameter grid (it is redefined per data source in the next cell)
param_grid = dict(num_filters=[32, 64],
                  kernel_size=[5, 7],
                  vocab_size=[5000],
                  embedding_dim=[50],
                  maxlen=[100])
In [29]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'sentiment-labelled-sentences/output.txt'

# Run a randomized parameter search for each source (yelp, amazon, imdb)
for source, frame in df.groupby('source'):
    print('Running grid search for data set :', source)
    # Use the per-source frame (not the full df) so each search runs on one data set
    sentences = frame['sentence'].values
    y = frame['label'].values

    # Train-test split
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Tokenize words
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)

    # Adding 1 because of the reserved 0 index
    vocab_size = len(tokenizer.word_index) + 1

    # Pad sequences with zeros
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # Parameter grid for the randomized search
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen])
    model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, batch_size=10,
                            verbose=False)
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=1, n_iter=5)
    grid_result = grid.fit(X_train, y_train)

    # Evaluate the testing set
    test_accuracy = grid.score(X_test, y_test)

    # Save and report the results
    prompt = input(f'finished {source}; write to file and proceed? [y/n]')
    if prompt.lower() not in {'y', 'true', 'yes'}:
        break
    with open(output_file, 'a') as f:
        s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
        output_string = s.format(
            source,
            grid_result.best_score_,
            grid_result.best_params_,
            test_accuracy)
        print(output_string)
        f.write(output_string)